Similarty matrix


In [20]:
import numpy as np
import scipy as sc
import pandas as pd
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [6]:
np.random.rand(3,2)


Out[6]:
array([[ 0.20099636,  0.05064982],
       [ 0.99286728,  0.39290391],
       [ 0.1397375 ,  0.0186377 ]])

In [5]:
df = pd.read_csv('dengue_neo.csv')

In [6]:
df.head()


Out[6]:
Unnamed: 0 Authors Title Year Source.title Volume Issue Art..No. Page.start Page.end ... Conference.code ISSN ISBN CODEN PubMed.ID Language.of.Original.Document Abbreviated.Source.Title Document.Type Source EID
0 1 Gouvêa M.M.; Jr. Time-spatial model on the dynamics of the prol... 2017.0 Communications in Nonlinear Science and Numeri... 44 NaN NaN 130 143 ... NaN 10075704 NaN NaN NaN English Comm. Nonlinear Sci. Numer. Simul. Article Scopus 2-s2.0-84981507175
1 3 Wang X.; Tang S.; Cheke R.A. A stage structured mosquito model incorporatin... 2016.0 Journal of Theoretical Biology 411 NaN NaN 27 36 ... NaN 225193 NaN JTBIA NaN English J. Theor. Biol. Article Scopus 2-s2.0-84991688005
2 4 Tang B.; Xiao Y.; Tang S.; Wu J. Modelling weekly vector control against Dengue... 2016.0 Journal of Theoretical Biology 410 NaN NaN 65 76 ... NaN 225193 NaN JTBIA NaN English J. Theor. Biol. Article Scopus 2-s2.0-84988473946
3 5 Delmelle E.; Hagenlocher M.; Kienberger S.; Ca... A spatial model of socioeconomic and environme... 2016.0 Acta Tropica 164 NaN NaN 169 176 ... NaN 0001706X NaN ACTRA NaN English Acta Trop. Article Scopus 2-s2.0-84988037437
4 6 Rodrigues H.S.; Monteiro M.T.T.; Torres D.F.M. Seasonality effects on dengue: basic reproduct... 2016.0 Mathematical Methods in the Applied Sciences 39 16 NaN 4671 4679 ... NaN 1704214 NaN MMSCD NaN English Math Methods Appl Sci Conference Paper Scopus 2-s2.0-84920399968

5 rows × 42 columns


In [7]:
print(df.columns.values)


['Unnamed: 0' 'Authors' 'Title' 'Year' 'Source.title' 'Volume' 'Issue'
 'Art..No.' 'Page.start' 'Page.end' 'Page.count' 'Cited.by' 'DOI' 'Link'
 'Affiliations' 'Authors.with.affiliations' 'Abstract' 'Author.Keywords'
 'Index.Keywords' 'Molecular.Sequence.Numbers' 'Chemicals.CAS' 'Tradenames'
 'Manufacturers' 'Funding.Details' 'References' 'Correspondence.Address'
 'Editors' 'Sponsors' 'Publisher' 'Conference.name' 'Conference.date'
 'Conference.location' 'Conference.code' 'ISSN' 'ISBN' 'CODEN' 'PubMed.ID'
 'Language.of.Original.Document' 'Abbreviated.Source.Title' 'Document.Type'
 'Source' 'EID']

In [8]:


In [9]:
t0


Out[9]:
'Some complex physical systems, such as cellular regulation, ecosystems, and societies, can be represented by local interactions between agents. Then, complex behaviors may emerge. A cellular automaton is a discrete dynamic system with these features. Among the several complex systems, epidemic diseases are given special attention by researchers with respect to their dynamics. Understanding the behavior of an epidemic may well benefit a society. For instance, different proliferation scenarios may be produced and a prevention policy set. This paper presents a new simulation method of the time-spatial spread of the Dengue mosquito with a cellular automaton. Thus, it will be possible to create different dissemination scenarios and preventive policies for these in several regions. Simulations were performed with different initial conditions and parameters as a result of which the behavior of the proposed method was characterized. © 2016 Elsevier B.V.'

In [10]:
tokens0 = nltk.wordpunct_tokenize(t0)

In [11]:
tokens0


Out[11]:
['Some',
 'complex',
 'physical',
 'systems',
 ',',
 'such',
 'as',
 'cellular',
 'regulation',
 ',',
 'ecosystems',
 ',',
 'and',
 'societies',
 ',',
 'can',
 'be',
 'represented',
 'by',
 'local',
 'interactions',
 'between',
 'agents',
 '.',
 'Then',
 ',',
 'complex',
 'behaviors',
 'may',
 'emerge',
 '.',
 'A',
 'cellular',
 'automaton',
 'is',
 'a',
 'discrete',
 'dynamic',
 'system',
 'with',
 'these',
 'features',
 '.',
 'Among',
 'the',
 'several',
 'complex',
 'systems',
 ',',
 'epidemic',
 'diseases',
 'are',
 'given',
 'special',
 'attention',
 'by',
 'researchers',
 'with',
 'respect',
 'to',
 'their',
 'dynamics',
 '.',
 'Understanding',
 'the',
 'behavior',
 'of',
 'an',
 'epidemic',
 'may',
 'well',
 'benefit',
 'a',
 'society',
 '.',
 'For',
 'instance',
 ',',
 'different',
 'proliferation',
 'scenarios',
 'may',
 'be',
 'produced',
 'and',
 'a',
 'prevention',
 'policy',
 'set',
 '.',
 'This',
 'paper',
 'presents',
 'a',
 'new',
 'simulation',
 'method',
 'of',
 'the',
 'time',
 '-',
 'spatial',
 'spread',
 'of',
 'the',
 'Dengue',
 'mosquito',
 'with',
 'a',
 'cellular',
 'automaton',
 '.',
 'Thus',
 ',',
 'it',
 'will',
 'be',
 'possible',
 'to',
 'create',
 'different',
 'dissemination',
 'scenarios',
 'and',
 'preventive',
 'policies',
 'for',
 'these',
 'in',
 'several',
 'regions',
 '.',
 'Simulations',
 'were',
 'performed',
 'with',
 'different',
 'initial',
 'conditions',
 'and',
 'parameters',
 'as',
 'a',
 'result',
 'of',
 'which',
 'the',
 'behavior',
 'of',
 'the',
 'proposed',
 'method',
 'was',
 'characterized',
 '.',
 '©',
 '2016',
 'Elsevier',
 'B',
 '.',
 'V',
 '.']

In [12]:
nltk_to = nltk.Text(t0)

In [13]:
words = [w.lower() for w in nltk_to]
vocab = sorted(set(words))
type(vocab)


Out[13]:
list

In [14]:
from nltk.corpus import stopwords
filtered_words_T0 = [w for w in tokens0 if not w in stopwords.words('english')]

In [15]:
print(filtered_words_T0)


['Some', 'complex', 'physical', 'systems', ',', 'cellular', 'regulation', ',', 'ecosystems', ',', 'societies', ',', 'represented', 'local', 'interactions', 'agents', '.', 'Then', ',', 'complex', 'behaviors', 'may', 'emerge', '.', 'A', 'cellular', 'automaton', 'discrete', 'dynamic', 'system', 'features', '.', 'Among', 'several', 'complex', 'systems', ',', 'epidemic', 'diseases', 'given', 'special', 'attention', 'researchers', 'respect', 'dynamics', '.', 'Understanding', 'behavior', 'epidemic', 'may', 'well', 'benefit', 'society', '.', 'For', 'instance', ',', 'different', 'proliferation', 'scenarios', 'may', 'produced', 'prevention', 'policy', 'set', '.', 'This', 'paper', 'presents', 'new', 'simulation', 'method', 'time', '-', 'spatial', 'spread', 'Dengue', 'mosquito', 'cellular', 'automaton', '.', 'Thus', ',', 'possible', 'create', 'different', 'dissemination', 'scenarios', 'preventive', 'policies', 'several', 'regions', '.', 'Simulations', 'performed', 'different', 'initial', 'conditions', 'parameters', 'result', 'behavior', 'proposed', 'method', 'characterized', '.', '©', '2016', 'Elsevier', 'B', '.', 'V', '.']

In [16]:
filtered_words_t0 = [w.lower() for w in filtered_words_T0 if w.isalnum()]
print(filtered_words_t0)


['some', 'complex', 'physical', 'systems', 'cellular', 'regulation', 'ecosystems', 'societies', 'represented', 'local', 'interactions', 'agents', 'then', 'complex', 'behaviors', 'may', 'emerge', 'a', 'cellular', 'automaton', 'discrete', 'dynamic', 'system', 'features', 'among', 'several', 'complex', 'systems', 'epidemic', 'diseases', 'given', 'special', 'attention', 'researchers', 'respect', 'dynamics', 'understanding', 'behavior', 'epidemic', 'may', 'well', 'benefit', 'society', 'for', 'instance', 'different', 'proliferation', 'scenarios', 'may', 'produced', 'prevention', 'policy', 'set', 'this', 'paper', 'presents', 'new', 'simulation', 'method', 'time', 'spatial', 'spread', 'dengue', 'mosquito', 'cellular', 'automaton', 'thus', 'possible', 'create', 'different', 'dissemination', 'scenarios', 'preventive', 'policies', 'several', 'regions', 'simulations', 'performed', 'different', 'initial', 'conditions', 'parameters', 'result', 'behavior', 'proposed', 'method', 'characterized', '2016', 'elsevier', 'b', 'v']

In [19]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(filtered_words_t0)
X_train_counts


Out[19]:
<91x73 sparse matrix of type '<class 'numpy.int64'>'
	with 88 stored elements in Compressed Sparse Row format>

In [21]:
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
X_train_tf.shape


Out[21]:
(91, 73)

In [22]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape


Out[22]:
(91, 73)

In [30]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

train_set = df['Abstract'].tolist()

tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix_train = tfidf_vectorizer.fit_transform(train_set)  #finds the tfidf score with normalization
print ("cosine scores ==> ",cosine_similarity(tfidf_matrix_train, tfidf_matrix_train))


cosine scores ==>  [[ 1.          0.12911647  0.09299782 ...,  0.10117668  0.16888597
   0.04722414]
 [ 0.12911647  1.          0.22658251 ...,  0.13396575  0.17346689
   0.05257748]
 [ 0.09299782  0.22658251  1.         ...,  0.08378385  0.15058275
   0.05015608]]

In [31]:
rtn = cosine_similarity(tfidf_matrix_train, tfidf_matrix_train)

In [37]:
print(rtn[0])


[ 1.          0.12911647  0.09299782 ...,  0.10117668  0.16888597
  0.04722414]

In [40]:
np.savetxt('docsim10.csv', rtn[0:10])

In [ ]: